Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN Method: Harris, Cook The parent wind speed distribution: Why Weibull? http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Star-imports provide the notebook's whole namespace (pd, plt, mpl, np names
# such as arange/cos/sin/pi, plus the project's helpers); load_libs() completes
# third-party setup.
from import_file import *
from helpers.parallel_helper import *
load_libs()

# Snap axis limits to round numbers and remove margins so histograms start
# exactly at the axes; force patch edges so bars stay visually separated.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path= './data/NCDC/europe/uk/marham/dat.txt' 
# file_path, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/uk/tiree/dat.txt', 4 
# file_path, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 4
# file_path= './data/NCDC/europe/uk/middle_wallop/dat.txt' 
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 4 
# file_path= "./data/NCDC/europe/germany/neuburg/dat.txt"
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path= './data/NCDC/europe/france/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # time shift
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path= './data/NCDC/europe/spain/almeria/dat.txt' # negative dimensions?
# file_path= './data/NCDC/europe/greece/eleftherios_intl/dat.txt'
# file_path= './data/NCDC/europe/greece/elefsis/dat.txt' # bad dataset
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth= "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt" , 0.6
# file_path= "./data/NCDC/cn/shanghai/pudong/dat.txt"
# file_path= "./data/NCDC/cn/hefei_luogang/dat.txt" # few 0, trend, try 2
# file_path= "./data/NCDC/cn/nanjing_lukou/dat.txt" 
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path= "./data/NCDC/oceania/auckland_intl/dat.txt"  # Good data, Weird KDE shape, might be blocked?
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/canberra/dat.txt" # high 0, numpy problem

# Active dataset for this run; bandwidth is pre-tuned so the grid search below is skipped.
file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, incomplete dataset

# file_path = './data/asos/denver/hr_avg.csv'
# file_path = './data/asos/bismarck_ND/hr_avg.csv' # try 4
# file_path = './data/asos/aberdeen_SD/hr_avg.csv' # only to 2012, good fit, try 2
# file_path = './data/asos/minneapolis/hr_avg.csv'
# file_path = './data/asos/lincoln_NE/hr_avg.csv' 
# file_path = './data/asos/des_moines_IA/hr_avg.csv'
# file_path = './data/asos/springfield_IL/hr_avg.csv' # good fit
# file_path = './data/asos/topeka/hr_avg.csv' # High 0

# file_path = './data/NDAWN/baker/hr_avg.csv' # 4 might be better
# file_path = './data/NDAWN/dickinson/hr_avg.csv'
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'
In [3]:
# Dispatch on the file path to the right loader/normaliser. All branches end
# with columns: date, HrMn, type, dir, speed, wind_type.
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    # NCDC records; HrMn read as string so leading zeros (e.g. "0100") survive.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    # .copy() makes the column subset an independent frame, so later column
    # assignments don't operate on a view (avoids SettingWithCopyWarning
    # seen downstream).
    df = df[['date','HrMn','type','dir','speed','wind_type' ]].copy()
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
    # knot_unit deliberately left unset here: it is auto-detected later (1.3.1).
elif 'NDAWN' in file_path:
    # NDAWN hourly averages: decimal speeds, already m/s.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = False
else:
    # ASOS hourly averages: decimal speeds, recorded in knots.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = True
In [4]:
# Quick look at the parsed frame (rich display of head/tail).
df
Out[4]:
date HrMn type dir speed wind_type
0 19850312 1400 FM-18 130 13.0 N
1 19850312 1500 FM-18 130 12.0 N
2 19850312 1600 FM-18 130 12.0 N
3 19850312 1700 FM-18 160 7.0 N
4 19850312 1800 FM-18 170 8.0 N
5 19850312 1900 FM-18 190 7.0 N
6 19850312 2000 FM-18 210 6.0 N
7 19850312 2100 FM-18 190 4.0 N
8 19850312 2200 FM-18 170 6.0 N
9 19850312 2300 FM-18 270 8.0 N
10 19850313 0000 FM-18 270 7.0 N
11 19850313 0100 FM-18 240 6.0 N
12 19850313 0200 FM-18 230 6.0 N
13 19850313 0300 FM-18 220 4.0 N
14 19850313 0400 FM-18 230 6.0 N
15 19850313 0500 FM-18 240 6.0 N
16 19850313 0600 FM-18 250 6.0 N
17 19850313 0700 FM-18 240 8.0 N
18 19850313 0800 FM-18 260 10.0 N
19 19850313 0900 FM-18 240 10.0 N
20 19850313 1000 FM-18 260 10.0 N
21 19850313 1100 FM-18 250 10.0 N
22 19850313 1200 FM-18 260 11.0 N
23 19850313 1300 FM-18 260 10.0 N
24 19850313 1400 FM-18 290 11.0 N
25 19850313 1500 FM-18 290 12.0 N
26 19850313 1600 FM-18 310 13.0 N
27 19850313 1800 FM-18 300 11.0 N
28 19850313 1900 FM-18 290 11.0 N
29 19850313 2000 FM-18 290 12.0 N
... ... ... ... ... ... ...
235836 20170331 1800 FM-13 110 2.0 N
235837 20170331 1900 FM-13 999 999.9 C
235838 20170331 2000 FM-13 80 4.0 N
235839 20170331 2100 FM-13 90 4.0 N
235840 20170331 2200 FM-13 70 6.0 N
235841 20170331 2300 FM-13 80 6.0 N
235842 20170401 0000 FM-13 90 8.0 N
235843 20170401 0100 FM-13 80 8.0 N
235844 20170401 0200 FM-13 80 10.0 N
235845 20170401 0300 FM-13 90 11.0 N
235846 20170401 0400 FM-13 80 11.0 N
235847 20170401 0500 FM-13 80 11.0 N
235848 20170401 0600 FM-13 80 11.0 N
235849 20170401 0700 FM-13 80 12.0 N
235850 20170401 0800 FM-13 80 12.0 N
235851 20170401 0900 FM-13 70 11.0 N
235852 20170401 1000 FM-13 70 12.0 N
235853 20170401 1100 FM-13 70 12.0 N
235854 20170401 1200 FM-13 60 13.0 N
235855 20170401 1300 FM-13 50 12.0 N
235856 20170401 1400 FM-13 50 12.0 N
235857 20170401 1500 FM-13 40 13.0 N
235858 20170401 1600 FM-13 40 14.0 N
235859 20170401 1700 FM-13 30 14.0 N
235860 20170401 1800 FM-13 30 14.0 N
235861 20170401 1900 FM-13 30 14.0 N
235862 20170401 2000 FM-13 30 14.0 N
235863 20170401 2100 FM-13 30 15.0 N
235864 20170401 2200 FM-13 30 14.0 N
235865 20170401 2300 FM-13 10 14.0 N

235866 rows × 6 columns

In [5]:
# For NCDC stations, show the site location on an OpenStreetMap (lat/long
# parsed from the data file by the project helper).
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
42.35 -70.69
In [6]:
# Build a DatetimeIndex from integer date (YYYYMMDD) + zero-padded HrMn (HHMM).
# astype(str) already yields strings, so the former extra .map(str) was redundant.
df['time']=pd.to_datetime(df["date"].astype(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
# dir == 999 is the missing-direction sentinel and is deliberately kept (<= 999);
# speeds >= 100 and records outside 1970-2016 are dropped.
df = df.query("(dir <= 999) & (speed < 100) & \
              (date >= 19700000) & (date < 20170000) ")
In [7]:
# Overview histograms of speed and direction (project helper).
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360] => [0,350]; the 999 missing sentinel is preserved by the guard.
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
df['month'] = df['date']%10000//100
# Convert windrose direction to the mathematical polar angle via (90 - x) % 360.
# NOTE: dir_windrose must be copied BEFORE the transform below — order matters.
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
date HrMn dir speed month dir_windrose
count 2.334610e+05 233461.000000 233461.000000 233461.000000 233461.000000 233461.000000
mean 2.000936e+07 1147.748875 196.144380 6.089394 6.566806 207.081881
std 9.282218e+04 692.306640 136.124931 3.331242 3.503616 139.864066
min 1.985031e+07 0.000000 0.000000 0.000000 1.000000 0.000000
25% 1.993041e+07 500.000000 120.000000 4.000000 3.000000 120.000000
50% 2.000113e+07 1100.000000 190.000000 6.000000 7.000000 210.000000
75% 2.009073e+07 1700.000000 260.000000 8.000000 10.000000 280.000000
max 2.016123e+07 2300.000000 999.000000 26.000000 12.000000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xd708940>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
# Heuristic unit detection: speeds converted from knots to m/s carry fractional
# parts, while natively integer data does not.
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
if 'knot_unit' not in globals():
    # Many (>30%) non-trivial decimals => values were converted from knots.
    knot_unit = len(df.query('decimal >= 0.2')) / len(df) > 0.3
    if knot_unit:
        # Convert back to knots and re-integerise.
        df['speed'] = df['speed'] * 1.943845
        df['decimal'] = df.speed % 1
        df.decimal.hist(alpha=0.5, label='knot')
        # need more elaboration, some is not near an integer
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')

# axis=1 keyword replaces the deprecated positional axis argument.
df.drop(['decimal'], axis=1, inplace=True)
print(knot_unit)
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
False
In [10]:
# Axis-label unit suffixes used by all later plots.
dir_unit_text = ' (degree)'
if knot_unit:  # direct truth test instead of "== True"
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
# Use post-2000 reports to decide the dominant report type, then keep only it.
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

# idxmax() returns the *label* of the most frequent type. Series.argmax is
# deprecated/positional in modern pandas and would break the query below.
report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
# Midpoint year of the record, used later to split the data for drift checks.
first_year = df.date.min() // 10000
last_year = df.date.max() // 10000
MID_YEAR = (first_year + last_year) // 2

# Sampling-time histograms: whole record vs the later half.
df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df.query('date > @MID_YEAR * 10000')['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='> %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
# Minutes-past-hour of each report; keep only the most common sampling minute.
df['sample_time'] = df.HrMn % 100 
sample_time = df.query('date > 20000000')['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
# Bind the chosen minute to a plain local first: subscripting a @variable
# inside query ("@sample_times[0]") is fragile across pandas versions.
most_common_sample_time = sample_times[0]
df = df.query("sample_time == @most_common_sample_time")
# axis=1 keyword replaces the deprecated positional axis argument.
df.drop(['sample_time'], axis=1, inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xe38eb38>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
# Integer-coded stations should report directions in multiples of 10;
# anything else (except the 999 missing sentinel) is an artefact and dropped.
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type month dir_windrose
time

sudden increase in speed

In [15]:
# Sudden increase detection: forward and backward hourly speed differences.
# fillna(0) keeps the first/last rows (diff yields NaN there) from matching
# the spike filter in the next cell.
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1993-03-13 23:00:00 19930313 2300 FM-18 20 26.0 N 3 70 3.0 5.0
1985-09-27 20:00:00 19850927 2000 FM-18 280 24.0 N 9 170 6.0 2.0
1992-12-12 14:00:00 19921212 1400 FM-18 50 23.0 N 12 40 2.0 1.0
1993-03-13 22:00:00 19930313 2200 FM-18 20 23.0 N 3 70 2.0 -3.0
1992-12-11 18:00:00 19921211 1800 FM-18 10 22.0 N 12 80 2.0 2.0
2003-12-07 02:00:00 20031207 200 FM-18 60 22.0 N 12 30 2.0 0.0
1985-09-27 21:00:00 19850927 2100 FM-18 270 22.0 N 9 180 -2.0 2.0
2003-12-07 03:00:00 20031207 300 FM-18 60 22.0 N 12 30 0.0 2.0
1992-12-13 02:00:00 19921213 200 FM-18 40 22.0 N 12 50 1.0 3.0
1992-12-12 15:00:00 19921212 1500 FM-18 40 22.0 N 12 50 -1.0 0.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xd66d4e0>
In [16]:
# Threshold for an implausible hour-to-hour jump (unit-dependent).
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
# Keep a row unless BOTH neighbours differ by more than the threshold
# (an isolated spike); one-sided jumps are kept.
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
# axis=1 keyword replaces the deprecated positional axis argument.
df.drop(['incre', 'incre_reverse'], axis=1, inplace=True)
sudden increase number 1
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1993-03-13 23:00:00 19930313 2300 FM-18 20 26.0 N 3 70 3.0 5.0
1985-09-27 20:00:00 19850927 2000 FM-18 280 24.0 N 9 170 6.0 2.0
1993-03-13 22:00:00 19930313 2200 FM-18 20 23.0 N 3 70 2.0 -3.0
1992-12-12 14:00:00 19921212 1400 FM-18 50 23.0 N 12 40 2.0 1.0
1986-12-19 11:00:00 19861219 1100 FM-18 70 22.0 N 12 20 2.0 1.0
2003-12-07 03:00:00 20031207 300 FM-18 60 22.0 N 12 30 0.0 2.0
1992-12-12 16:00:00 19921212 1600 FM-18 50 22.0 N 12 40 0.0 3.0
1985-09-27 21:00:00 19850927 2100 FM-18 270 22.0 N 9 180 -2.0 2.0
1992-12-12 15:00:00 19921212 1500 FM-18 40 22.0 N 12 50 -1.0 0.0
2003-12-07 02:00:00 20031207 200 FM-18 60 22.0 N 12 30 2.0 0.0

1.4.2 Direction re-aligment

For some datasets, the 16 sectors are not recorded properly;

e.g. the recorded sectors are [0, 20, 50, ...]; the angles need to be redistributed into 22.5-degree sectors, e.g. [0, 22.5, 45, ...]

In [17]:
display(df['dir'].value_counts().sort_index())
# Count only valid directions (999 is the missing sentinel). Building the
# mask from the SAME filtered counts avoids indexing with a boolean Series
# whose index also contains 999 — unaligned boolean indexing is an error in
# modern pandas. Result is identical: filtering 999 rows does not change the
# per-direction counts.
valid_dir_counts = df.query('dir < 999')['dir'].value_counts()
effective_column = valid_dir_counts[valid_dir_counts > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0      3268
10     2973
20     3265
30     3460
40     3945
50     4485
60     4641
70     4754
80     4396
90     4144
100    3936
110    4522
120    5801
130    6521
140    7334
150    7647
160    7957
170    8295
180    8399
190    6724
200    6296
210    6639
220    6665
230    7142
240    7168
250    7362
260    6459
270    5605
280    4961
290    5359
300    6069
310    5671
320    4648
330    4157
340    3575
350    3125
999    3636
Name: dir, dtype: int64
36 10.0
In [18]:
# Snap mis-aligned sector angles back onto the regular sector grid (helper).
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
# Decide whether calm (zero-speed) records are over-represented, judged on
# data from 2005 onward; if so, drop all zero-speed rows.
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df.query("(date >= 20050000)"))
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0104086353123
In [20]:
# Records with dir == 999 (missing direction) — here all zero-speed, see the
# printed counts — get a direction imputed by the helper before fitting.
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
0.0    3636
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
# Direction bin edges: 'even' centres 10-degree bins on the recorded sector
# angles (edges at -5, 5, 15, ...); 'round_up' aligns edges to sector starts.
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df.query('date < @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
# Same early/late split as above, for the direction distribution.
df.query('date < @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
# (helper plots one panel per multi-year window, see output below).
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1985 - 1989
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1990 - 1994
1995 - 1999
2000 - 2004
2006 - 2009
2010 - 2013
In [24]:
# Annual ('A') and monthly ('M') mean speed, to eyeball long-term trends.
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[24]:
(0, 10.0)
In [25]:
# Show, then drop, any rows whose direction is still missing after imputation.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type month dir_windrose
time
In [26]:
# One small figure per year with enough data (>5000 rows): that year's
# histogram overlaid on the all-years histogram, common y-limit for comparison.
# NOTE(review): 'normed' is deprecated in newer matplotlib ('density' is the
# successor) — confirm before upgrading the environment.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 5000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [27]:
# Rolling 3-year-window R-square of the windowed density against the
# all-years density, per column — a drift measure over time.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        # 3-year window centred on `year`; slicing uses the DatetimeIndex.
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 5000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            # R^2 treats the window density as a prediction of the overall density.
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    # Clamp the lower y-limit into [0, 0.85] so the curve stays readable.
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better

In [28]:
# Spread each recorded sector angle across its sector so the KDE is not
# spiked at the discrete sector centres (helper).
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
if integer_data:
    # Spread integer speeds over a unit interval so the KDE stays smooth.
    # Direction of the spread depends on whether calm records were deleted
    # ('up' means e.g. 0 -> [0, 1], per the helper's report).
    redistribute_method = 'down' if delete_zero else 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# A small dot can appear at the centre of the scatter — many zero-speed
# records collapse there. Matplotlib scatter is also slow on large datasets.
dir_in_radians = df['dir'] * pi / 180.0
df['x'] = df['speed'] * cos(dir_in_radians)
df['y'] = df['speed'] * sin(dir_in_radians)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
# Restrict the fit to 2010-2014; the commented query shows other filters tried.
df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? False
Report type used: FM-18
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed month dir_windrose x y
count 2.216000e+04 22160.000000 22160.000000 22160.000000 22160.000000 22160.000000 22160.000000 22160.000000
mean 2.011058e+07 1146.841155 183.364229 6.641272 6.739305 206.230144 -1.545472 0.360532
std 9.095691e+03 692.277481 90.802087 3.408135 3.494046 123.420179 5.165248 5.150290
min 2.010010e+07 0.000000 -4.948656 0.018183 1.000000 0.000000 -17.812196 -17.797982
25% 2.010093e+07 500.000000 121.841969 4.096309 4.000000 130.000000 -5.073480 -3.543385
50% 2.011052e+07 1100.000000 182.313806 5.986295 7.000000 210.000000 -1.589137 -0.212269
75% 2.012071e+07 1700.000000 257.901744 8.798595 10.000000 290.000000 2.043028 3.928375
max 2.013043e+07 2300.000000 354.984989 21.805785 12.000000 999.000000 20.663463 21.048884
In [33]:
# Speed trace of the selected 2010-2014 window.
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x13e2f9e8>
In [34]:
# Accumulation by month: record counts per month, exposing gaps in the series.
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x13e4a438>
In [35]:
# 90 degree is east
# Windrose of the selected window: 36 sectors, frequency-normalised bars.
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
# Guard for very large datasets: resample to 500k rows so later fitting stays
# tractable, and compare direction histograms before/after resampling.
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    # Legend label typo fixed ('resmapled' -> 'resampled').
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
# Weibull fit vs data: (1) histogram comparison, (2) log-log CDF ("Weibull
# plot"), where a Weibull distribution appears as a straight line.
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
# NOTE: log(0) at V=0 or P in {0, 1} yields -inf (hence the divide-by-zero
# warnings below); those points simply fall outside the plot.
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
# Cartesian scatter of the wind-vector samples, equal aspect so directions
# are not distorted.
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
# Re-bin at the native sector width: 22.5 degrees for 16-sector records,
# otherwise 10 degrees.
rebinned_angle = 22.5 if len(effective_column) == 16 else 10
In [40]:
# Per-sector speed histograms on a shared axis range, one figure per sector.
original_incre, incre = SECTOR_LENGTH, rebinned_angle
# Offset the first sector centre so bins straddle the recorded angles.
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
# Common [x, y] plot range so all sector panels are directly comparable.
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

2.3 Overview by Month

In [41]:
# Monthly windroses (speed >= 1 only); month_incre > 1 would group months.
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 12+month_incre, month_incre): 
    end_month = month+month_incre
    sub_df = current_df.query('(month >= @month) and (month < @end_month)')
    if len(sub_df) > 0:
        if month_incre == 1:
            title = 'Month: %s' % (month)
        else:
            title = 'Month: %s - %s ' % (month, end_month-1)
        ax = WindroseAxes.from_ax()
        ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
        plt_configure(figsize=(3,3), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

3. Create input data and configuration

In [42]:
# Assemble the (x, y) sample matrix and the global fitting configuration.
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
# NOTE(review): constant name is misspelled ("BANDWDITH") but deliberately
# kept — later cells reference it by this exact name.
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [43]:
# Fit on a square grid covering the central 95% of speeds; the sparse tail
# would otherwise dominate the square-error score.
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

# Cartesian product of the axis with itself as [x, y] pairs — comprehension
# replaces the nested-loop append (identical order and contents).
FITTING_RANGE = [[i, j] for i in fitting_axis_range for j in fitting_axis_range]
[-13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4
   5   6   7   8   9  10  11  12  13]
In [44]:
# Plot axes cover the same central 95% speed range as the fitting grid.
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [45]:
# KDE configuration; the commented line is a tophat-kernel alternative.
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [46]:
%%time
# Select the KDE bandwidth by 4-fold cross-validated grid search, unless a
# bandwidth was pinned next to file_path above.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    # NOTE(review): sklearn.grid_search is removed in sklearn >= 0.20;
    # model_selection.GridSearchCV is the replacement (kept here for speed,
    # per the author's comment below).
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    # so search on a <= 50k resample of the data.
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
0.9
Wall time: 29.7 s
In [47]:
# Fall back to the default bandwidth if the search above was skipped entirely.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it via exp()
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.9 729
[  2.77043071e-09   4.75180819e-09   5.93878546e-08   4.87303176e-07
   1.61480480e-06]
In [48]:
# Plot jPDF: 3-D surface, 2-D density, and a raw 2-D histogram for comparison.
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
# Shared colour scale so later model plots are directly comparable.
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

# 2-D histogram companion plot, grid lines disabled for readability.
with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
In [49]:
# Cumulative distribution derived from the KDE density values (helper).
kde_cdf = cdf_from_pdf(kde_result)

5. GMM by Expectation-maximization

In [50]:
# Baseline: full-covariance Gaussian mixture fitted by EM.
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [51]:
# Convert the sklearn fit into the project's flat GMM parameter format
# (weight, mean_x, mean_y, sig_x, sig_y, corr per component) and display it.
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[51]:
weight mean_x mean_y sig_x sig_y corr
1 0.469 -0.740 -3.303 3.501 2.921 0.026
2 0.315 -6.233 3.611 3.655 3.836 -0.009
3 0.216 3.526 3.572 4.268 5.284 -0.049
In [52]:
# Draw one ellipse per mixture component over the (x, y) plane.
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.468926919487 [[-0.73961337 -3.30280626]] [ 2.91799961  3.50342649] -85.9278697201
0.314699800509 [[-6.23324591  3.61124102]] [ 3.65353877  3.83762123] -174.690818753
0.216373280005 [[ 3.52610226  3.57183843]] [ 4.25376396  5.295953  ] -173.590971788
In [53]:
X = Y = PLOT_AXIS_RANGE
# GMM joint density on the plotting grid (score_samples is log-density).
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residual_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the fitted GMM density."""
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residual_between_kde_and_gmm)

# 3-D surfaces: model density and KDE-minus-model residual.
plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

# 2-D panels: KDE and GMM share colorbar_lim for a fair visual comparison.
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [54]:
# Goodness-of-fit of the EM-fitted GMM against the KDE reference on FITTING_RANGE
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[54]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.966 0.016 0.031 9.030760e-08 0.037 0.224

6. GMM by Optimization

In [55]:
# Inputs for the optimisation-based fit: samples, evaluation grid, speed cap
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
# Show which fit-score method (e.g. square_error) the optimiser will minimise
print(FIT_METHOD)
square_error
In [56]:
# Refine the EM solution by directly minimising the fit score between the
# GMM pdf and the KDE reference, under constraints.
# Flat parameter layout per component: weight, mean_x, mean_y, sig_x, sig_y, rho
x0 = gmm_em_result

constraint_list = [
    # the component weights (every 6th element) must sum to one
    {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
    # # optionally restrict the ellipse width/height ratio
    # {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
    # {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

# Per-component box bounds, repeated for each of the len(x0)//6 components:
# weight in [0, 0.99], means in [-fit_limit, fit_limit],
# sigmas in [0, fit_limit], correlation in [-0.99, 0.99]
per_component_bounds = [(0., 0.99), (-fit_limit, fit_limit), (-fit_limit, fit_limit),
                        (0., fit_limit), (0., fit_limit), (-0.99, 0.99)]
all_bounds = per_component_bounds * (len(x0) // 6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds=all_bounds,
    constraints=constraint_list,
    tol=0.000000000001,
    options={"maxiter": 500})
result
Out[56]:
     fun: -16.748215402416456
     jac: array([  6.57759666e-01,   0.00000000e+00,   0.00000000e+00,
        -2.38418579e-07,   0.00000000e+00,  -2.38418579e-07,
         6.57761097e-01,   0.00000000e+00,   2.38418579e-07,
        -2.38418579e-07,  -4.76837158e-07,  -7.15255737e-07,
         6.57760382e-01,   0.00000000e+00,   0.00000000e+00,
        -2.38418579e-07,  -2.38418579e-07,  -4.76837158e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 871
     nit: 43
    njev: 43
  status: 0
 success: True
       x: array([ 0.21552353,  3.57528705,  1.78909983,  3.0374554 ,  5.05507849,
        0.13286722,  0.21943653, -0.10399873, -3.84824076,  3.0176846 ,
        2.13295069, -0.16287579,  0.56503994, -4.58349928,  1.43375139,
        4.23658902,  5.49671426, -0.27634076])

6.1 GMM Result

In [57]:
# Regroup the flat optimiser solution into per-component GMM parameters
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
# Build the mixture pdf from the grouped parameters and evaluate on the grid
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[57]:
weight mean_x mean_y sig_x sig_y corr
1 0.565 -4.583 1.434 4.237 5.497 -0.276
2 0.219 -0.104 -3.848 3.018 2.133 -0.163
3 0.216 3.575 1.789 3.037 5.055 0.133
In [58]:
# Component ellipses of the optimisation-based GMM fit
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.565039944377 [[-4.58349928  1.43375139]] [ 3.89767953  5.74200736] -156.810265409
0.219436525716 [[-0.10399873 -3.84824076]] [ 2.0784283   3.05549256] -102.353967648
0.215523529907 [[ 3.57528705  1.78909983]] [ 2.9958447   5.07984925] 172.984654801

6.2 Goodness-of-fit statistics

In [59]:
# Goodness-of-fit of the optimised GMM against the KDE reference
gof_df(gmm_pdf_result, kde_result)
Out[59]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.013 0.063 5.325280e-08 0.029 0.172
In [60]:
# Compare the optimised GMM density with the KDE reference on the plot grid
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X, Y, mixed_model_pdf)  # a function is passed as the argument

def kde_minus_gmm(points):
    """Residual density: KDE estimate minus optimised-GMM pdf at `points`."""
    return exp(kde.score_samples(points)) - mixed_model_pdf(points)

residual_Z = generate_Z_from_X_Y(X, Y, kde_minus_gmm)

# 3-D views of the fitted density and its residual
plot_3d_prob_density(X, Y, pdf_Z)
plot_3d_prob_density(X, Y, residual_Z)
align_figures()

# 2-D density maps: KDE reference, optimised GMM, and the residual
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X, Y, kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X, Y, pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X, Y, residual_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
In [61]:
# Side-by-side 2-D density maps: KDE (left) vs optimised GMM (right)
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='', ylabel='', colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='', ylabel='', colorbar=False)
ax2.grid(False)
# Hide the duplicated y-axis on the right panel
ax2.get_yaxis().set_visible(False)
In [62]:
def f(V,theta):
    """Joint pdf of wind speed V and direction theta in polar coordinates.

    Evaluates the fitted Cartesian mixture pdf at (V*cos(theta), V*sin(theta))
    and multiplies by V, the Jacobian of the polar transform, so that f can be
    integrated directly over (V, theta).
    """
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V
In [63]:
# Weibull-plot comparison of the speed CDFs: empirical, Weibull fit, GMM
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 3. GMM distribution
# Speed CDF from the polar joint pdf: integrate f over [0, x_val] x [0, 2*pi]
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# ln(-ln(1-P)) vs ln(V): a Weibull distribution plots as a straight line.
# NOTE: log(0) at V=0 causes the divide-by-zero RuntimeWarnings shown below.
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'})
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:9: RuntimeWarning: divide by zero encountered in log
In [64]:
# Calculate Speed Distribution
# 1. GMM Model
x = arange(0, max_speed, 0.5)
# Approximate the marginal speed pdf at each x_ by integrating f over a narrow
# speed band (width 0.02) and all directions, then rescale to histogram counts
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])*len(df.speed)/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm,'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)
Speed Distribution Comparison
In [65]:
# Calculate Angle Distribution
# Integrate f over all speeds within each 10-degree (pi/18 rad) direction bin
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir']) 

df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
# Convert bin centres from radians to degrees for plotting
plot(x/pi*180, y,'-', color='black', label='GMM')
title='Direction Distribution Comparison'
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
# Keep a handle on this figure; it is redisplayed later for comparison
dir_fig = plt.gcf()
print(title)
Direction Distribution Comparison
In [66]:
# %%time
# Sector-by-sector comparison of the GMM direction fit, computed in parallel
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in the paper:
# "Comparison of bivariate distribution construction approaches for analysing wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.909029760727

6.3 Sectoral Comparison

In [67]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    """Compare GMM and Weibull fits against observations, sector by sector.

    For each direction sector of width `incre` degrees, the observed speeds
    are compared with the sectoral GMM (obtained by integrating the polar
    joint pdf f over the sector) and a per-sector Weibull fit: frequency
    histograms, CDFs, maximum CDF differences (K-S style) and sectoral
    R-square values are produced.

    Parameters:
        df             -- observation DataFrame with speed and direction columns
        original_incre -- sector width (degrees) of the original data binning
        incre          -- sector width (degrees) used for this comparison

    Returns:
        (max_diff_array, curve_collection): per-sector summary statistics and
        the per-sector fitted curves (pdf/cdf/ecdf) for later reuse.
    """
    # Offset the first sector so rebinned sectors stay centred like the originals
    start, end = -original_incre/2 + incre/2, 360
    max_diff_array = []
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = radians(angle), radians(incre)  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 3.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
#         plt_configure(xlabel = "$V$", ylabel='Frequency', legend=True)
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 3.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
#         plt_configure(xlabel = "$V$", ylabel='$P$', legend=True)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        # 3.3. Weibull Comparison
#         ax3 = fig.add_subplot(1,3,3)
#         plot(log(x), log(-log(1-y_ecdf)),'o', alpha=0.8, label='Data')
#         plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label='GMM')
#         plot(log(x), log(-log(1-y_cdf_weibull)),'--',color='black',label='Weibull')
#         plt.gca().set_xlim(right = log(max_speed+1))
# #         plt_configure(xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
#         plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
        
        # Sectoral R-square: compare observed density per unit-speed bin with
        # the GMM (integrated over each bin) and the Weibull bin probabilities
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # Maximum absolute CDF differences (K-S style statistic) per model
        diff, diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
        max_diff_array.append([len(sub_df), angle, diff.max(), x[diff.argmax()], 
                               diff_weibull.max(), x[diff_weibull.argmax()], R_square_gmm, R_square_weibull])
        curves = {'angle': angle, 'data_size': data_size, 'weight': direction_prob, 
                  'x': x, 'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', diff.max(), diff_weibull.max(), 'speed value:', x[diff.argmax()], x[diff_weibull.argmax()], 'y gmm', y_cdf_gmm[diff.argmax()])
        print(' ')
    return max_diff_array, curve_collection
In [68]:
%%time
# Rebin sectors: 22.5 degrees for 16-sector stations, 20 degrees otherwise
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
max_diff_array, curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 510 weight 0.023014440433212997
GMM Weibull
R square 0.831853615978 0.924503248255
max diff: 0.155050422821 0.0668158813089 speed value: 8.95319226392 3.35744709897 y gmm 0.915834736547
 
25.0 (15.0 - 35.0) degree
data size: 795 weight 0.03587545126353791
GMM Weibull
R square 0.938586899248 0.944267060155
max diff: 0.0619827308519 0.0565823540399 speed value: 9.66186965721 3.22062321907 y gmm 0.899718579908
 
45.0 (35.0 - 55.0) degree
data size: 868 weight 0.03916967509025271
GMM Weibull
R square 0.830685915562 0.913364924761
max diff: 0.080778628579 0.0387557629903 speed value: 11.4739411191 6.88436467144 y gmm 0.913727937335
 
65.0 (55.0 - 75.0) degree
data size: 988 weight 0.044584837545126356
GMM Weibull
R square 0.827425489353 0.86994753252
max diff: 0.0974624586288 0.0460219119217 speed value: 9.18138320056 5.73836450035 y gmm 0.710822782515
 
85.0 (75.0 - 95.0) degree
data size: 812 weight 0.03664259927797834
GMM Weibull
R square 0.869139166031 0.890423432052
max diff: 0.0907753548831 0.0348315498478 speed value: 10.278303636 10.278303636 y gmm 0.83954382779
 
105.0 (95.0 - 115.0) degree
data size: 887 weight 0.04002707581227437
GMM Weibull
R square 0.925826253695 0.977865831355
max diff: 0.076980634698 0.0475580591216 speed value: 10.3438001202 5.17190006011 y gmm 0.756164799349
 
125.0 (115.0 - 135.0) degree
data size: 1639 weight 0.07396209386281588
GMM Weibull
R square 0.877060668491 0.939512788871
max diff: 0.0954038351094 0.096189634633 speed value: 10.7596935653 6.45581613916 y gmm 0.673357604793
 
145.0 (135.0 - 155.0) degree
data size: 1966 weight 0.08871841155234657
GMM Weibull
R square 0.924560662011 0.9590352199
max diff: 0.052793371228 0.185265367165 speed value: 12.4661804989 9.34963537417 y gmm 0.817501644031
 
165.0 (155.0 - 175.0) degree
data size: 1904 weight 0.08592057761732852
GMM Weibull
R square 0.92201498778 0.933729450422
max diff: 0.0507256556212 0.0935438637231 speed value: 8.24416552531 5.15260345332 y gmm 0.56753237831
 
185.0 (175.0 - 195.0) degree
data size: 1540 weight 0.06949458483754513
GMM Weibull
R square 0.78393013152 0.867600821095
max diff: 0.101706051574 0.0592194017124 speed value: 6.20024329564 6.20024329564 y gmm 0.449592649724
 
205.0 (195.0 - 215.0) degree
data size: 1365 weight 0.06159747292418773
GMM Weibull
R square 0.919547496016 0.937241353317
max diff: 0.0682388876424 0.059355208928 speed value: 5.68886537487 5.68886537487 y gmm 0.468024848621
 
225.0 (215.0 - 235.0) degree
data size: 1417 weight 0.06394404332129965
GMM Weibull
R square 0.96794710049 0.9470026053
max diff: 0.0200611302168 0.0608998833517 speed value: 10.6478236966 5.73344352896 y gmm 0.936890175358
 
245.0 (235.0 - 255.0) degree
data size: 1543 weight 0.06962996389891697
GMM Weibull
R square 0.983164526285 0.980181289385
max diff: 0.0333915617798 0.0285589846367 speed value: 6.90191281969 5.91592527402 y gmm 0.719716902026
 
265.0 (255.0 - 275.0) degree
data size: 1418 weight 0.06398916967509025
GMM Weibull
R square 0.960004778728 0.957065322793
max diff: 0.0833719749421 0.0489794566841 speed value: 5.70118816076 5.70118816076 y gmm 0.572481339585
 
285.0 (275.0 - 295.0) degree
data size: 1562 weight 0.07048736462093863
GMM Weibull
R square 0.943493804672 0.956386212115
max diff: 0.0725677637151 0.0629582544097 speed value: 6.43728446483 2.75883619921 y gmm 0.673270904659
 
305.0 (295.0 - 315.0) degree
data size: 1240 weight 0.05595667870036101
GMM Weibull
R square 0.9524177808 0.924485641347
max diff: 0.0379643480288 0.0521379530509 speed value: 4.77939822492 6.69115751489 y gmm 0.409616297133
 
325.0 (315.0 - 335.0) degree
data size: 837 weight 0.037770758122743685
GMM Weibull
R square 0.878263329892 0.920386925562
max diff: 0.0673901319242 0.181312378804 speed value: 5.24964168497 3.14978501098 y gmm 0.521618231278
 
345.0 (335.0 - 355.0) degree
data size: 694 weight 0.03131768953068592
GMM Weibull
R square 0.865764031098 0.923550602423
max diff: 0.091481222377 0.153108498423 speed value: 8.77099298435 2.19274824609 y gmm 0.925775170504
 
Wall time: 47.1 s
In [69]:
# Collect per-sector statistics and compare sectoral R-square of GMM vs Weibull
diff_df = pd.DataFrame(max_diff_array,columns=['datasize','direction', 'gmm', 'speed_gmm',
                                               'weibull', 'speed_weibull', 'r_square_gmm', 'r_square_weibull'])  

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
# Pin the y-range (top=1, bottom at most 0.75) so plots compare across stations
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9074193879039459 0.9345672265848012
In [70]:
# Sectoral comparison of the K-S statistic (max CDF difference), GMM vs Weibull
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.gmm, diff_df.weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
# Ensure the y-range reaches at least 0.25 for cross-station comparability
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.06921797900922057 0.07831111435894395
In [71]:
# Redisplay the direction-distribution figure so sector weights can be
# compared with the sectoral statistics above
display(dir_fig)

6.4 Insufficient-fit Sector Investigation

6.4.1 Data Variability, by Bootstrap (Resampling)

In [72]:
# Pick the sector with the largest GMM max-CDF difference (index 2) for a closer look
max_diff_element = max(max_diff_array, key=lambda x: x[2])
angle =  max_diff_angle = max_diff_element[1]
incre = rebinned_angle
In [73]:
# Bootstrap resampling fraction (1 = resamples the full sector size, with replacement)
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [74]:
# Bootstrap the worst-fit sector: compare GMM/Weibull fits against resampled data
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
# 99 bootstrap resamples (with replacement); each overlays its ECDF
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    # Configure legends only once to avoid 99 duplicate entries
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
# Envelope of the resampled histograms (per-bin min/max across resamples)
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
# NOTE(review): y_ecdf here is from the LAST bootstrap resample, not the
# original sector ECDF — confirm this is the intended diff reference
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
5.0 (-5.0 - 15.0) Degree Speed Distribution
0.128101695595 9.0 0.918297774026

6.4.2 Time Variability

In [75]:
# Time variability of the worst-fit sector: per-5-year ECDFs vs the GMM/Weibull fits
fig_time_variability_3d = plt.figure()
# fig.gca(projection=...) is deprecated; add the 3-D axes explicitly
ax1 = fig_time_variability_3d.add_subplot(111, projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
# 'axes.color_cycle' is deprecated (see the UserWarning it emitted);
# read the color list from 'axes.prop_cycle' instead
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])
# Dates are encoded as integers YYYYMMDD; step through 5-year windows
for start_time in range(20000000, 20150000, 50000):
    end_time = start_time + 50000 
    time_label = start_time//10000
    df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = time_label)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        
        title = '%s - %s' %(time_label, time_label+4)
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=time_label, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = time_label*np.ones_like(x)
        # Label the fitted curves only once (on the 2010 slice) to keep the legend clean
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if time_label == 2010 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if time_label == 2010 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
# plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)", legend={'loc':'best'})
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:25: RuntimeWarning: divide by zero encountered in log
5.0 (-5.0 - 15.0) Degree Speed Distribution

6.4.3 Adjacent Sector Variability

In [76]:
# The worst-fit sector and its two immediate neighbours
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [77]:
# Adjacent-sector variability: compare fitted curves across neighbouring sectors
fig_adjecent_variability_3d = plt.figure()
# fig.gca(projection=...) is deprecated; add the 3-D axes explicitly
ax1 = fig_adjecent_variability_3d.add_subplot(111, projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
# 'axes.color_cycle' is deprecated (see the UserWarning it emitted);
# read the color list from 'axes.prop_cycle' instead
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    # Fetch the stored curves for this sector (angle wrapped into [0, 360))
    curves = curve_df.query('angle == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['data_size'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    # Solid line for the worst-fit sector, dotted for its neighbours
    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3  # NOTE(review): unused below — confirm intent

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    # Add the 3-D legend only once (first sector) to avoid duplicate entries
    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
5.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [78]:
# Fall back to defaults for optional config globals not set by earlier cells
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
# Bundle the fit settings passed to the resampling/cross-validation helpers
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
0.9 square_error

7.1 Variability of the Result

In [79]:
%%time
# Bootstrap: refit the GMM on 10 resampled datasets to gauge result variability
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    # Goodness-of-fit vs the resample's own KDE, then vs the original KDE
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.573 -4.491 1.413 4.287 5.449 -0.283
2 0.221 -0.077 -3.898 3.075 2.075 -0.102
3 0.206 3.697 1.999 2.916 5.021 0.166
GMM Plot Result
0.573498755416 [[-4.49075743  1.41267105]] [ 3.91530611  5.72174132] -155.26771496
0.22068697109 [[-0.07690099 -3.89848995]] [ 2.05557474  3.08815254] -97.072286055
0.205814273494 [[ 3.6972301   1.99894789]] [ 2.85596257  5.05484351] 171.911231541
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.016 0.088 6.169562e-08 0.030 0.185
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.014 0.073 5.470885e-08 0.029 0.174

weight mean_x mean_y sig_x sig_y corr
1 0.574 -4.442 1.544 4.307 5.511 -0.243
2 0.227 -0.232 -3.781 3.112 2.181 -0.199
3 0.199 3.672 1.575 2.742 5.091 0.201
GMM Plot Result
0.573827333022 [[-4.44230179  1.54428468]] [ 4.02610267  5.71993871] -157.867380718
0.227044117174 [[-0.23161838 -3.78127921]] [ 2.10070646  3.16707232] -104.341427268
0.199128549804 [[ 3.67233487  1.57505537]] [ 2.66522577  5.13157826] 171.524577981
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.016 0.122 5.928902e-08 0.031 0.181
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.016 0.111 5.450419e-08 0.029 0.174

weight mean_x mean_y sig_x sig_y corr
1 0.586 -4.355 1.554 4.411 5.410 -0.263
2 0.224 -0.192 -3.891 3.005 2.113 -0.182
3 0.191 3.886 1.662 2.915 5.061 0.182
GMM Plot Result
0.585576800473 [[-4.35541442  1.55376869]] [ 4.04900866  5.68603448] -154.008837678
0.22358405068 [[-0.19163745 -3.89091139]] [ 2.04646503  3.04992566] -103.401250134
0.190839148848 [[ 3.88648654  1.66238916]] [ 2.84385223  5.10161593] 171.295731728
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.011 0.061 5.918320e-08 0.030 0.181
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.012 0.066 5.476795e-08 0.029 0.174

weight mean_x mean_y sig_x sig_y corr
1 0.582 -4.470 1.578 4.374 5.444 -0.242
2 0.223 -0.182 -3.825 3.032 2.215 -0.161
3 0.196 3.689 1.513 2.723 5.151 0.240
GMM Plot Result
0.581812304143 [[-4.46956689  1.57822633]] [ 4.07327935  5.67273877] -156.194847639
0.222560761389 [[-0.18213164 -3.8247706 ]] [ 2.1563309   3.07365939] -103.391340203
0.195626934468 [[ 3.68917953  1.51251326]] [ 2.61542276  5.2066828 ] 170.3076751
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.014 0.118 5.787179e-08 0.030 0.179
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.016 0.118 5.496006e-08 0.029 0.174

weight mean_x mean_y sig_x sig_y corr
1 0.590 -4.393 1.583 4.432 5.350 -0.268
2 0.227 -0.286 -3.943 3.022 2.177 -0.088
3 0.183 3.796 1.809 2.676 5.107 0.156
GMM Plot Result
0.589955029365 [[-4.39339255  1.58306952]] [ 4.04388096  5.64941946] -152.614354769
0.227020154519 [[-0.28594332 -3.94329338]] [ 2.16004252  3.03445133] -97.3853578382
0.183024816116 [[ 3.79606675  1.80868377]] [ 2.63113432  5.12985809] 173.633109918
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.013 0.108 5.825870e-08 0.030 0.179
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.017 0.113 5.573080e-08 0.029 0.176

weight mean_x mean_y sig_x sig_y corr
1 0.584 -4.459 1.432 4.336 5.516 -0.253
2 0.217 -0.124 -3.835 3.058 2.122 -0.178
3 0.199 3.604 1.672 2.826 5.104 0.175
GMM Plot Result
0.583968453871 [[-4.45877565  1.43185072]] [ 4.02720378  5.74553144] -156.917535335
0.216947741788 [[-0.12358384 -3.83533049]] [ 2.05965269  3.1006613 ] -102.720934572
0.19908380434 [[ 3.60381118  1.67216946]] [ 2.76336475  5.13826459] 172.180974048
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.015 0.090 5.949958e-08 0.030 0.182
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.015 0.095 5.386887e-08 0.029 0.173

weight mean_x mean_y sig_x sig_y corr
1 0.531 -4.872 1.455 4.055 5.530 -0.284
2 0.249 3.202 1.686 3.332 5.086 0.173
3 0.220 -0.025 -3.834 3.060 2.130 -0.182
GMM Plot Result
0.531338008475 [[-4.87199371  1.4553973 ]] [ 3.74140647  5.7466977 ] -158.992768237
0.248968730319 [[ 3.20209214  1.68643146]] [ 3.24658842  5.14067264] 169.151968001
0.219693261206 [[-0.02536693 -3.83354553]] [ 2.06400777  3.10499292] -103.073088141
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.013 0.060 5.809991e-08 0.030 0.179
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.010 0.053 5.388960e-08 0.029 0.173

weight mean_x mean_y sig_x sig_y corr
1 0.558 -4.648 1.190 4.052 5.584 -0.295
2 0.234 3.287 1.889 3.367 5.014 0.087
3 0.208 0.218 -3.886 2.970 2.066 -0.116
GMM Plot Result
0.55790364907 [[-4.64769641  1.18994963]] [ 3.72116223  5.80940307] -158.943322661
0.233826593225 [[ 3.28727054  1.8885243 ]] [ 3.34412917  5.02936092] 173.995218924
0.208269757705 [[ 0.21772828 -3.8862783 ]] [ 2.03967133  2.98867952] -98.6737790192
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.012 0.047 5.397457e-08 0.029 0.173
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.013 0.046 5.491841e-08 0.029 0.174

weight mean_x mean_y sig_x sig_y corr
1 0.684 -2.539 2.256 6.516 4.940 -0.189
2 0.186 1.110 -4.050 2.192 2.062 0.080
3 0.130 -3.596 -2.789 1.866 3.146 -0.180
GMM Plot Result
0.683581874106 [[-2.53926297  2.25601777]] [ 4.74879045  6.6566675 ] -106.982379558
0.186046241183 [[ 1.11002552 -4.05036228]] [ 2.0182592  2.2317759] -63.7826995498
0.130371884711 [[-3.59555476 -2.78884965]] [ 1.82012699  3.17300756] -170.89003259
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.026 0.048 5.514197e-08 0.028 0.174
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.025 0.050 5.431908e-08 0.029 0.173

weight mean_x mean_y sig_x sig_y corr
1 0.510 -4.998 1.498 4.006 5.634 -0.297
2 0.268 2.945 1.623 3.751 4.941 0.143
3 0.222 0.115 -3.892 2.991 2.135 -0.115
GMM Plot Result
0.50987502213 [[-4.99768625  1.49814234]] [ 3.68411209  5.8494859 ] -159.753797777
0.267715653136 [[ 2.94548029  1.62299409]] [ 3.66479752  5.00530721] 166.428440464
0.222409324735 [[ 0.11523195 -3.89185148]] [ 2.10667815  3.01076766] -99.2723874586
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.010 0.047 6.088596e-08 0.030 0.184
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.012 0.040 5.519387e-08 0.029 0.175
Wall time: 14.2 s

7.2 Cross-validation, to select the number of Gaussian

In [80]:
# df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn%400 == 0)')
In [81]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset 16620.0 5540.0
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.161283 0.070286 5.232743e-07 0.090387 0.538181 0.806027
1 0.161615 0.071153 5.443462e-07 0.090243 0.549347 0.796359
2 0.161828 0.069304 5.172227e-07 0.090207 0.534489 0.805971
3 0.157598 0.068001 5.139975e-07 0.090678 0.533296 0.805925
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.174387 0.059709 5.011200e-07 0.087954 0.526430 0.807645
1 0.164540 0.071689 5.053075e-07 0.093062 0.527366 0.811157
2 0.170172 0.071582 5.673245e-07 0.091407 0.561925 0.790242
3 0.178380 0.075763 5.890364e-07 0.092040 0.571042 0.786484
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.065659 0.021114 8.428406e-08 0.036320 0.215844 0.968182
1 0.072354 0.020953 8.461223e-08 0.036154 0.216455 0.968209
2 0.075168 0.021383 8.611948e-08 0.037208 0.218290 0.967810
3 0.075363 0.020292 9.220351e-08 0.037220 0.225962 0.965854
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.091661 0.035722 1.237991e-07 0.043132 0.262193 0.955168
1 0.093169 0.022349 1.152451e-07 0.042860 0.252302 0.957595
2 0.077515 0.023429 1.114449e-07 0.039512 0.248395 0.958360
3 0.087269 0.026655 8.861248e-08 0.038155 0.221220 0.965903
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.055706 0.012489 5.736852e-08 0.029624 0.178054 0.978425
1 0.060726 0.012182 5.106235e-08 0.028127 0.168043 0.980836
2 0.078675 0.013133 5.459318e-08 0.029604 0.173948 0.979692
3 0.070134 0.012170 5.433062e-08 0.028690 0.173441 0.979676
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.069187 0.015272 6.962269e-08 0.032485 0.196696 0.974454
1 0.081883 0.019107 8.319249e-08 0.035558 0.214781 0.969242
2 0.111059 0.027759 7.604825e-08 0.032598 0.204672 0.971143
3 0.072076 0.020104 7.441184e-08 0.034441 0.202766 0.972233
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.024102 0.011912 2.483813e-08 0.019478 0.117310 0.990740
1 0.022750 0.010945 2.352661e-08 0.019070 0.114028 0.991245
2 0.021235 0.010324 2.232480e-08 0.018731 0.111147 0.991633
3 0.025085 0.010732 2.275274e-08 0.018982 0.112220 0.991405
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.043793 0.010340 4.309262e-08 0.026283 0.154149 0.983764
1 0.032649 0.024414 5.713694e-08 0.030150 0.178168 0.978414
2 0.037313 0.012192 4.148151e-08 0.025061 0.151521 0.984578
3 0.029203 0.009908 4.382232e-08 0.025472 0.155687 0.984102
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.018687 0.006935 1.652485e-08 0.016122 0.095637 0.993756
1 0.015942 0.009711 1.937579e-08 0.017367 0.103559 0.992776
2 0.017233 0.005933 1.521875e-08 0.015231 0.091792 0.994281
3 0.018154 0.007785 1.675117e-08 0.016169 0.096239 0.993784
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.028798 0.014864 4.520370e-08 0.025014 0.158115 0.983631
1 0.020090 0.012639 2.689964e-08 0.020471 0.121970 0.989832
2 0.035511 0.013888 4.054986e-08 0.025603 0.149697 0.985080
3 0.036650 0.014280 4.418118e-08 0.026006 0.156564 0.983113
Wall time: 47.1 s
In [82]:
# Aggregate the per-fold goodness-of-fit tables into mean/std per Gaussian count.
# The four names stay module-level: the plotting cell below reads them.
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)

for label, scores in (('Train', train_scores_mean), ('Test', test_scores_mean)):
    print(label + ' gof mean, std')
    display(scores)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.160581 0.069686 5.247102e-07 0.090379 0.538828 0.803571
2 0.072136 0.020935 8.680482e-08 0.036725 0.219137 0.967513
3 0.066310 0.012493 5.433867e-08 0.029011 0.173371 0.979657
4 0.023293 0.010978 2.336057e-08 0.019065 0.113676 0.991256
5 0.017504 0.007591 1.696764e-08 0.016222 0.096807 0.993649
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.171870 0.069686 5.406971e-07 0.091116 0.546691 0.798882
2 0.087404 0.027039 1.097754e-07 0.040915 0.246028 0.959256
3 0.083551 0.020561 7.581882e-08 0.033770 0.204729 0.971768
4 0.035739 0.014214 4.638335e-08 0.026742 0.159881 0.982714
5 0.030262 0.013918 3.920859e-08 0.024274 0.146587 0.985414
In [83]:
# axes.color_cycle was deprecated in matplotlib 1.5 (it triggered the
# UserWarning shown below) and removed in 2.0; read the colors from
# axes.prop_cycle instead.
prop_cycle = mpl.rcParams['axes.prop_cycle'].by_key()['color']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    # Training curve (dashed) with a +/- 1 std band across folds.
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])

    # Test curve (solid) with its +/- 1 std band.
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    # R^2 is bounded above by 1; K-S and chi-square are non-negative —
    # clamp the axes so the bands do not suggest impossible values.
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [84]:
# Plot kde_Z (left) and pdf_Z (right) side by side on a shared figure.
fig = plt.figure(figsize=(4.2, 2.4))
ax1, ax2 = (fig.add_subplot(1, 2, pos) for pos in (1, 2))

plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)

plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax2.grid(False)
# Hide the right panel's y-axis; it duplicates the left panel's.
ax2.get_yaxis().set_visible(False)
In [ ]:
# Re-display the previously created figures so they appear together here.
# (Single pass; the loop variable no longer shadows the `fig` defined above.)
all_figures = ([fig_hist, fig_kde, fig_em, fig_gmm]
               + [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull,
                  fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull])
for figure in all_figures:
    display(figure)
In [ ]:
import time

# Save the notebook, then export an HTML copy named after the data location.
save_notebook()
time.sleep(3)  # give the save a moment to flush before exporting

location_name = get_location_name(file_path)
print(location_name)

current_file = 'GMM.ipynb'
output_file = './output_HTML/{}.html'.format(location_name)
output_HTML(current_file, output_file)